This document contains analyses of BERT cosine distances on zeugmatic sentences with ambiguous words, and compares those distances to human similarity judgments.
First, we load the data with summary statistics about each item. We also load the BERT cosine distances.
### Set working directory (comment this out to run)
# setwd("/Users/seantrott/Dropbox/UCSD/Research/Ambiguity/SSD/zeugma_norms/src/analysis")

### Load norming data: per-item human similarity judgments
df_normed <- read_csv("../../data/raw/similarity.csv")
## Parsed with column specification:
## cols(
## CW = col_character(),
## String = col_character(),
## `Dominance Score` = col_character(),
## `Alternative Dominance Score from Armstrong et al. (2012) EDOM` = col_character(),
## `Similarity Norming Category` = col_character(),
## `Zeugmatic Similarity Norming Sentence` = col_character(),
## Anaphora = col_character(),
## `Similarity Mean` = col_double(),
## `Similarity STDEV` = col_double(),
## `Similarity SEM` = col_double()
## )
# Sanity check: number of normed items (320 per the rendered output below)
nrow(df_normed)
## [1] 320
# Lowercase the critical word (CW) so it matches the `word` column
# in the BERT output for joining.
df_normed <- df_normed %>%
  mutate(word = tolower(CW))

### Load BERT cosine distances; drop the unnamed row-index column (X1)
df_bert <- read_csv("../../data/processed/distances.csv") %>%
  select(-X1)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## `Similarity Norming Category` = col_character(),
## distance_bert_large_hf_layer_1 = col_double(),
## distance_bert_large_hf_layer_10 = col_double(),
## distance_bert_large_hf_layer_11 = col_double(),
## distance_bert_large_hf_layer_12 = col_double(),
## distance_bert_large_hf_layer_2 = col_double(),
## distance_bert_large_hf_layer_3 = col_double(),
## distance_bert_large_hf_layer_4 = col_double(),
## distance_bert_large_hf_layer_5 = col_double(),
## distance_bert_large_hf_layer_6 = col_double(),
## distance_bert_large_hf_layer_7 = col_double(),
## distance_bert_large_hf_layer_8 = col_double(),
## distance_bert_large_hf_layer_9 = col_double(),
## string = col_character(),
## word = col_character()
## )
# 314 rows — six fewer than the 320 normed items
nrow(df_bert)
## [1] 314
We then merge them together:
# Join human norms with BERT distances on word + norming category,
# then add shorter aliases for the two key columns.
df_merged <- df_normed %>%
  inner_join(df_bert, by = c('word', 'Similarity Norming Category')) %>%
  mutate(ambiguity_type = `Similarity Norming Category`) %>%
  mutate(sim = `Similarity Mean`)
nrow(df_merged)
## [1] 314
As a sanity check, we first confirm that the similarity scores reflect the underlying Ambiguity Type.
# Similarity judgments split by ambiguity type: boxplot view.
df_merged %>%
  ggplot(aes(x = ambiguity_type, y = sim, fill = ambiguity_type)) +
  geom_boxplot() +
  labs(x = "Ambiguity Type",
       y = "Similarity Judgment",
       fill = "Ambiguity Type") +
  theme_minimal()

# Same comparison shown as overlapping density ridges.
df_merged %>%
  ggplot(aes(x = sim, y = ambiguity_type, fill = ambiguity_type)) +
  geom_density_ridges2(aes(height = ..density..),
                       stat = "density",
                       color = gray(0.25),
                       alpha = 0.5,
                       scale = 0.85,
                       size = 0.9) +
  labs(x = "Similarity Judgment",
       y = "Ambiguity type") +
  theme_minimal()
However, this effect seems considerably weaker for the cosine distance measures:
# Final-layer cosine distance split by ambiguity type: boxplot view.
df_merged %>%
  ggplot(aes(x = ambiguity_type,
             y = distance_bert_large_hf_layer_12,
             fill = ambiguity_type)) +
  geom_boxplot() +
  labs(x = "Ambiguity Type",
       y = "Cosine Distance (Final Layer)",
       fill = "Ambiguity Type") +
  theme_minimal()

# Same comparison shown as overlapping density ridges.
df_merged %>%
  ggplot(aes(x = distance_bert_large_hf_layer_12,
             y = ambiguity_type,
             fill = ambiguity_type)) +
  geom_density_ridges2(aes(height = ..density..),
                       stat = "density",
                       color = gray(0.25),
                       alpha = 0.5,
                       scale = 0.85,
                       size = 0.9) +
  labs(x = "Cosine Distance (Final Layer)",
       y = "Ambiguity type",
       fill = "Ambiguity Type") +
  theme_minimal()
We find that a model predicting Distance (Final layer) with Ambiguity Type, and a random intercept for Anaphora, explains more variance than a model with only the random intercept.
# Does ambiguity type predict final-layer distance beyond item-level
# (Anaphora) variation? Fit with ML (REML = FALSE) so the full and
# reduced models can be compared by likelihood-ratio test.
model_full <- lmer(
  distance_bert_large_hf_layer_12 ~ ambiguity_type + (1 | Anaphora),
  data = df_merged, REML = FALSE
)
model_reduced <- lmer(
  distance_bert_large_hf_layer_12 ~ (1 | Anaphora),
  data = df_merged, REML = FALSE
)
summary(model_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: distance_bert_large_hf_layer_12 ~ ambiguity_type + (1 | Anaphora)
## Data: df_merged
##
## AIC BIC logLik deviance df.resid
## -589.7 -567.2 300.9 -601.7 308
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.2789 -0.6327 -0.0599 0.6889 3.1911
##
## Random effects:
## Groups Name Variance Std.Dev.
## Anaphora (Intercept) 0.035720 0.18900
## Residual 0.006571 0.08106
## Number of obs: 314, groups: Anaphora, 30
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.43830 0.03757 31.44282 11.668 5.85e-13 ***
## ambiguity_typeIP -0.02024 0.01340 286.37763 -1.510 0.1321
## ambiguity_typeRP -0.03551 0.01467 288.04514 -2.421 0.0161 *
## ambiguity_typeUA -0.03454 0.01448 286.50318 -2.385 0.0177 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) amb_IP amb_RP
## ambgty_tyIP -0.204
## ambgty_tyRP -0.225 0.454
## ambgty_tyUA -0.199 0.443 0.530
# Likelihood-ratio test: full (ambiguity type) vs. intercept-only model
anova(model_full, model_reduced)
## Data: df_merged
## Models:
## model_reduced: distance_bert_large_hf_layer_12 ~ (1 | Anaphora)
## model_full: distance_bert_large_hf_layer_12 ~ ambiguity_type + (1 | Anaphora)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model_reduced 3 -588.27 -577.02 297.14 -594.27
## model_full 6 -589.74 -567.24 300.87 -601.74 7.4658 3 0.05844 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Mean and SD of final-layer cosine distance per ambiguity type.
df_merged %>%
  group_by(ambiguity_type) %>%
  summarise(mean_distance = mean(distance_bert_large_hf_layer_12),
            sd_distance = sd(distance_bert_large_hf_layer_12))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 3
## ambiguity_type mean_distance sd_distance
## <chr> <dbl> <dbl>
## 1 H 0.515 0.0951
## 2 IP 0.497 0.123
## 3 RP 0.464 0.121
## 4 UA 0.455 0.105
Here, we correlate cosine distance with similarity judgments, and analyze this across all layers of BERT.
# Correlate each layer's cosine distance with human similarity judgments.
# FIX: the original grew df_all_layers with rbind() inside the loop
# (O(n^2) copying anti-pattern); build one tidy row per layer and bind once.
df_all_layers <- dplyr::bind_rows(lapply(1:12, function(layer) {
  col_name <- paste("distance_bert_large_hf_layer", layer, sep = "_")
  r <- cor.test(df_merged[[col_name]], df_merged$sim)
  df_r <- broom::tidy(r)
  df_r$layer <- layer
  df_r
}))
# Correlation (with 95% CI error bars) between distance and similarity,
# across all 12 BERT layers.
df_all_layers %>%
  ggplot(aes(x = layer,
             y = estimate)) +
  geom_line() +
  # FIX: ymin/ymax were swapped (conf.high was mapped to ymin). geom_errorbar
  # renders the same bar either way, but the mapping is now semantically correct.
  geom_errorbar(aes(ymin = conf.low,
                    ymax = conf.high),
                width = .2,
                position = position_dodge(.9)) +
  labs(x = "Layer",
       y = "Correlation between distance and similarity") +
  theme_minimal() +
  theme(axis.title = element_text(size = rel(2)),
        axis.text = element_text(size = rel(2)),
        legend.text = element_text(size = rel(2)),
        legend.title = element_text(size = rel(2)))
ggsave("../../Figures/r_layers.png", dpi = 300)
## Saving 7 x 5 in image
# Layer with the strongest (most negative) distance–similarity correlation.
# FIX: replaces filter(estimate == min(df_all_layers$estimate)) — the
# redundant data-frame reference and exact equality test — with slice_min,
# which keeps ties just as the original filter did.
df_all_layers %>%
  slice_min(estimate, n = 1)
## # A tibble: 1 x 9
## estimate statistic p.value parameter conf.low conf.high method alternative
## <dbl> <dbl> <dbl> <int> <dbl> <dbl> <chr> <chr>
## 1 -0.231 -4.20 3.44e-5 312 -0.334 -0.124 Pears… two.sided
## # … with 1 more variable: layer <int>
## Now view layer
# Scatter: final-layer cosine distance vs. human similarity, by type.
df_merged %>%
  ggplot(aes(x = distance_bert_large_hf_layer_12, y = sim,
             color = ambiguity_type, shape = ambiguity_type)) +
  geom_point(size = 2, alpha = .6) +
  labs(x = "Cosine Distance",
       y = "Similarity Judgment",
       color = "Ambiguity Type",
       shape = "Ambiguity Type") +
  theme_minimal()
We also asked whether Distance (Layer 12) improves a model above and beyond ambiguity type.
# Does final-layer distance predict similarity above and beyond
# ambiguity type? ML fits (REML = FALSE) for the LRT below.
model_full <- lmer(
  sim ~ ambiguity_type + distance_bert_large_hf_layer_12 + (1 | Anaphora),
  data = df_merged, REML = FALSE
)
model_reduced <- lmer(
  sim ~ ambiguity_type + (1 | Anaphora),
  data = df_merged, REML = FALSE
)
summary(model_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: sim ~ ambiguity_type + distance_bert_large_hf_layer_12 + (1 |
## Anaphora)
## Data: df_merged
##
## AIC BIC logLik deviance df.resid
## 819.9 846.2 -403.0 805.9 307
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.7912 -0.4819 0.0042 0.4082 3.9082
##
## Random effects:
## Groups Name Variance Std.Dev.
## Anaphora (Intercept) 0.01073 0.1036
## Residual 0.75476 0.8688
## Number of obs: 314, groups: Anaphora, 30
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 2.0842 0.2547 233.6958 8.182 1.79e-14
## ambiguity_typeIP 0.9214 0.1389 312.9471 6.634 1.44e-10
## ambiguity_typeRP 3.2300 0.1466 237.5380 22.035 < 2e-16
## ambiguity_typeUA 4.5779 0.1457 271.2706 31.431 < 2e-16
## distance_bert_large_hf_layer_12 -0.7241 0.4500 288.9129 -1.609 0.109
##
## (Intercept) ***
## ambiguity_typeIP ***
## ambiguity_typeRP ***
## ambiguity_typeUA ***
## distance_bert_large_hf_layer_12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) amb_IP amb_RP amb_UA
## ambgty_tyIP -0.338
## ambgty_tyRP -0.425 0.493
## ambgty_tyUA -0.443 0.496 0.538
## dstn_____12 -0.893 0.059 0.131 0.155
# LRT: does adding the distance predictor improve fit over type alone?
anova(model_full, model_reduced)
## Data: df_merged
## Models:
## model_reduced: sim ~ ambiguity_type + (1 | Anaphora)
## model_full: sim ~ ambiguity_type + distance_bert_large_hf_layer_12 + (1 |
## model_full: Anaphora)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model_reduced 6 820.49 842.99 -404.25 808.49
## model_full 7 819.93 846.18 -402.97 805.93 2.5644 1 0.1093
Here, we used a masked language modeling task to measure the probability of observing the anaphoric word in that position. This is meant to approximate measures of a subject's experience of encountering that anaphoric word, such as RT or the N400 effect.
# Load masked-word probabilities from the masked-LM task.
# NOTE(review): unlike df_bert, the unnamed X1 index column is not
# dropped here — harmless (it is excluded from the join), but inconsistent.
df_surprisal <- read_csv("../../data/processed/surprisals.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## `Similarity Norming Category` = col_character(),
## probability = col_double(),
## string = col_character(),
## word = col_character()
## )
# 314 rows — matches df_merged, so the inner join below keeps all rows
nrow(df_surprisal)
## [1] 314
# Attach surprisal (-log probability of the masked anaphoric word)
# to the merged data.
df_merged <- inner_join(df_merged, df_surprisal,
                        by = c('word', 'Similarity Norming Category'))
df_merged <- mutate(df_merged, surprisal = -log(probability))
nrow(df_merged)
# Distribution of masked-word surprisal by ambiguity type (density ridges).
df_merged %>%
  ggplot(aes(x = surprisal, y = ambiguity_type, fill = ambiguity_type)) +
  geom_density_ridges2(aes(height = ..density..),
                       stat = "density",
                       color = gray(0.25),
                       alpha = 0.5,
                       scale = 0.85,
                       size = 0.9) +
  labs(x = "Surprisal of Masked Word",
       y = "Ambiguity type",
       fill = "Ambiguity Type") +
  theme_minimal() +
  theme(axis.title = element_text(size = rel(2)),
        axis.text = element_text(size = rel(2)),
        legend.text = element_text(size = rel(2)),
        legend.title = element_text(size = rel(2)))
ggsave("../../Figures/surprisal_condition.png", dpi = 300)
## Saving 7 x 5 in image
# Does ambiguity type predict surprisal, beyond item-level variation?
model_full <- lmer(
  surprisal ~ ambiguity_type + (1 | Anaphora),
  data = df_merged, REML = FALSE
)
model_reduced <- lmer(
  surprisal ~ (1 | Anaphora),
  data = df_merged, REML = FALSE
)
summary(model_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: surprisal ~ ambiguity_type + (1 | Anaphora)
## Data: df_merged
##
## AIC BIC logLik deviance df.resid
## 1170.5 1193.0 -579.3 1158.5 308
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.5055 -0.5250 0.0620 0.6646 2.5062
##
## Random effects:
## Groups Name Variance Std.Dev.
## Anaphora (Intercept) 5.829 2.414
## Residual 1.874 1.369
## Number of obs: 314, groups: Anaphora, 30
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 6.48397 0.50587 35.98690 12.817 5.59e-15 ***
## ambiguity_typeIP -0.08125 0.22576 290.58920 -0.360 0.71918
## ambiguity_typeRP -0.68832 0.24683 292.90675 -2.789 0.00564 **
## ambiguity_typeUA -0.40211 0.24386 291.16813 -1.649 0.10024
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) amb_IP amb_RP
## ambgty_tyIP -0.255
## ambgty_tyRP -0.282 0.457
## ambgty_tyUA -0.251 0.445 0.530
# LRT: surprisal model with ambiguity type vs. intercept-only
anova(model_full, model_reduced)
## Data: df_merged
## Models:
## model_reduced: surprisal ~ (1 | Anaphora)
## model_full: surprisal ~ ambiguity_type + (1 | Anaphora)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model_reduced 3 1173.5 1184.7 -583.74 1167.5
## model_full 6 1170.5 1193.0 -579.27 1158.5 8.9396 3 0.0301 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Mean and SD of masked-word surprisal per ambiguity type.
df_merged %>%
  group_by(ambiguity_type) %>%
  summarise(mean_surprisal = mean(surprisal),
            sd_surprisal = sd(surprisal))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 3
## ambiguity_type mean_surprisal sd_surprisal
## <chr> <dbl> <dbl>
## 1 H 6.42 1.73
## 2 IP 6.56 2.17
## 3 RP 5.06 2.17
## 4 UA 5.19 1.93
We also asked whether the surprisal of the anaphoric word was predictive of similarity above and beyond the ambiguity type category, and found that it was.
# Does surprisal predict similarity above and beyond ambiguity type?
model_full <- lmer(
  sim ~ surprisal + ambiguity_type + (1 | Anaphora),
  data = df_merged, REML = FALSE
)
model_reduced <- lmer(
  sim ~ ambiguity_type + (1 | Anaphora),
  data = df_merged, REML = FALSE
)
summary(model_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: sim ~ surprisal + ambiguity_type + (1 | Anaphora)
## Data: df_merged
##
## AIC BIC logLik deviance df.resid
## 809.7 835.9 -397.8 795.7 307
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9800 -0.5274 -0.0347 0.4479 3.6869
##
## Random effects:
## Groups Name Variance Std.Dev.
## Anaphora (Intercept) 0.008238 0.09077
## Residual 0.731757 0.85543
## Number of obs: 314, groups: Anaphora, 30
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 2.28351 0.19353 51.75943 11.799 2.72e-16 ***
## surprisal -0.09208 0.02537 84.17449 -3.629 0.000487 ***
## ambiguity_typeIP 0.94693 0.13649 311.60784 6.938 2.31e-11 ***
## ambiguity_typeRP 3.17344 0.14487 204.79796 21.905 < 2e-16 ***
## ambiguity_typeUA 4.53230 0.14325 239.82428 31.638 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) srprsl amb_IP amb_RP
## surprisal -0.820
## ambgty_tyIP -0.349 -0.023
## ambgty_tyRP -0.541 0.184 0.479
## ambgty_tyUA -0.530 0.172 0.483 0.541
# LRT: does adding surprisal improve fit over ambiguity type alone?
anova(model_full, model_reduced)
## Data: df_merged
## Models:
## model_reduced: sim ~ ambiguity_type + (1 | Anaphora)
## model_full: sim ~ surprisal + ambiguity_type + (1 | Anaphora)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model_reduced 6 820.49 842.99 -404.25 808.49
## model_full 7 809.67 835.92 -397.84 795.67 12.822 1 0.0003427 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Scatter of surprisal vs. similarity judgment, colored by type,
# followed by the overall Pearson correlation.
df_merged %>%
  ggplot(aes(x = surprisal, y = sim,
             color = ambiguity_type, shape = ambiguity_type)) +
  geom_point(size = 2, alpha = .6) +
  labs(x = "Surprisal of Masked Word",
       y = "Similarity Judgment",
       color = "Ambiguity Type",
       shape = "Ambiguity Type") +
  theme_minimal()
cor.test(df_merged$surprisal, df_merged$sim)
##
## Pearson's product-moment correlation
##
## data: df_merged$surprisal and df_merged$sim
## t = -6.6958, df = 312, p-value = 9.982e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4475856 -0.2537325
## sample estimates:
## cor
## -0.3544618